## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 625156 33.4 1434672 76.7 702048 37.5
## Vcells 1171313 9.0 8388608 64.0 1927980 14.8
library(magrittr)
library(data.table)
library(knitr)
library(ggplot2)
library(ComplexUpset)
`%!in%` = Negate(`%in%`)
`%nin%` = Negate(`%in%`)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))fp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ath.gmm = gmm[gmm$plant == 'ath', ]
setDT(ath.gmm)
ath.gmm[, plant := NULL]
ath.gmm[, source := NULL]
colnames(ath.gmm)[2:4] = paste('ath', colnames(ath.gmm)[2:4], sep = '_')note: some duplicated ids in PSS
fp = file.path('..', 'input', 'ath-annot', 'Phytozome', 'PhytozomeV12',
'early_release', 'Athaliana_447_Araport11', 'annotation')
# fn = 'Araport11_GFF3_genes_transposons.current_utf8_attributes_CB.tsv'
fn = 'Athaliana_447_Araport11.geneName.txt'
gn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
colnames(gn)[2] = 'athName'
gn$V1 = sub('\\..*', '', gn$V1)
gn = gn[!duplicated(gn), ]
fn = 'Athaliana_447_Araport11.synonym.txt'
sn = data.table::fread(file.path(fp, fn), header = FALSE, fill = TRUE)
sn[, merged_column := apply(.SD, 1, function(x) {
# Remove NA and empty strings
x = x[!is.na(x) & x != ""]
paste(x, collapse = " | ")
}), .SDcols = 2:ncol(sn)]
# Optionally, remove the original columns V2 to V15
sn[, (2:(ncol(sn)-1)) := NULL]
colnames(sn)[2] = 'athSynonims'
sn$V1 = sub('\\..*', '', sn$V1)
sn = sn[!duplicated(sn), ]
fp = file.path('..', 'input', 'SKM_2025-07-08')
fn = 'rxn-nodes-public.tsv'
pss = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
ind = grep('^name$|^all_pathways|^short_name$', colnames(pss), value = TRUE)
pss = pss[, ..ind]
ind = grep('\\[', pss$name)
pss = pss[ind, ]
pss[, ids_string := stringr::str_extract(name, "(?<=\\[)[^\\]]+(?=\\])")]
pss[, ids_list := strsplit(ids_string, split = ",")]
max_ids = max(lengths(pss$ids_list))
for (i in seq_len(max_ids)) {
pss[[paste0("id_", i)]] = sapply(pss$ids_list, function(x) ifelse(length(x) >= i, x[i], NA))
}
pss[, c("ids_string", "ids_list") := NULL]
pss_long = melt(
pss,
id.vars = c("name", "all_pathways", 'short_name'), # Columns to keep as is
measure.vars = patterns("^id_"), # Columns to melt (all starting with "id_")
variable.name = "id_num", # Name for the melted variable column
value.name = "id" # Name for the melted value column
)
pss_long = pss_long[!is.na(id) & id != ""]
pss_long[, id_num := NULL]
pss_long[, name := NULL]
pss_long$id = sub('\\..*', '', pss_long$id)
pss_long = pss_long[!duplicated(pss_long), ]
table(duplicated(pss_long$id))##
## FALSE TRUE
## 816 24
pss_long %>%
dplyr::filter(id %in% id[duplicated(id)] & stringr::str_starts(id, "^AT")) %>%
dplyr::arrange(id) %>%
print()## all_pathways short_name id
## <char> <char> <char>
## 1: Hormone - Ethylene (ET) ORA59 AT1G06160
## 2: Hormone - Ethylene (ET) ERF/ORA59 AT1G06160
## 3: Hormone - Salicylic acid (SA) TCP8 AT1G58100
## 4: Hormone - Salicylic acid (SA) TCP8,14,15 AT1G58100
## 5: Hormone - Ethylene (ET) EDF2 AT1G68840
## 6: Hormone - Ethylene (ET) ERF/EDF AT1G68840
## 7: Signalling - Heat-shock proteins (HSPs),Stress - Heat HSP70 AT3G12580
## 8: Signalling - Heat-shock proteins (HSPs) HSP AT3G12580
## 9: Hormone - Ethylene (ET) ERF1 AT3G23240
## 10: Hormone - Ethylene (ET) ERF/EDF AT3G23240
## 11: Hormone - Ethylene (ET) ERF6 AT4G17490
## 12: Hormone - Ethylene (ET) ERF/ORA59 AT4G17490
## 13: Hormone - Ethylene (ET) ERF1 AT4G17500
## 14: Hormone - Ethylene (ET) ERF/EDF AT4G17500
## 15: Signalling - Heat-shock proteins (HSPs) MED37E AT5G02500
## 16: Signalling - Heat-shock proteins (HSPs) HSP AT5G02500
## 17: Hormone - Ethylene (ET) ERF096 AT5G43410
## 18: Hormone - Ethylene (ET) ERF/EDF AT5G43410
## 19: Hormone - Ethylene (ET) ERF5 AT5G47230
## 20: Hormone - Ethylene (ET) ERF/ORA59 AT5G47230
## 21: Hormone - Ethylene (ET) ERF105 AT5G51190
## 22: Hormone - Ethylene (ET) ERF/ORA59 AT5G51190
## 23: Signalling - Heat-shock proteins (HSPs) HSP18.1-CI AT5G59720
## 24: Signalling - Heat-shock proteins (HSPs) HSP AT5G59720
## 25: Hormone - Ethylene (ET) ERF104 AT5G61600
## 26: Hormone - Ethylene (ET) ERF/EDF AT5G61600
## all_pathways short_name id
pss_long = pss_long %>%
dplyr::filter(stringr::str_starts(id, "AT")) %>%
dplyr::group_by(id) %>%
dplyr::summarise(
dplyr::across(
.cols = dplyr::everything(),
.fns = ~ {
vals = unique(na.omit(.))
if (length(vals) > 1) paste(vals, collapse = " | ")
else if (length(vals) == 1) vals
else NA_character_
}
),
.groups = "drop"
)
pss_long[pss_long == ""] = "-"
gn[is.na(gn)] = "-"
sn[is.na(sn)] = "-"
ath.annot = pss_long %>%
dplyr::full_join(gn, by = c("id" = "V1")) %>%
dplyr::full_join(sn, by = c("id" = "V1"))Note: 35.2 bin matcheswill be ignored
| Plant Name | Label | JCVI-MCScan | Compara Plants | Plaza | OrthoDB | FastOma | RBH | Mercator |
|---|---|---|---|---|---|---|---|---|
| Malus domestica | apple | mdo_GDv1 | malus_domestica_golden | mdo | mdo | mdo | mdo | mdo |
| mdo_HChap1 | ||||||||
| Prunus persica | ppe | ppe | prunus_persica | ppe | ppe | pper | ppe | pper |
| Prunus dulcis / P. amygdalus | almond | almond | prunus_dulcis | pdul | pdul | pdul | pdul | |
| Prunus avium | wild cherry | wildcherry | prunus_avium | pavi | pavi | pavi | pavi | |
| Prunus armeniaca | apricot | apricot | parm | parm | parm | parm | ||
| Prunus cerasifera | cherry plum | cherryplum | pcer | pcer | pcer | |||
| Pyrus | pear | pear | pcox | pcox | pcox | |||
| Prunus sibirica | Siberian apricot | siberianapricot | psib | psib | psib | psib |
params_list <- list(
plantName = 'mdo'
, plantNameOut = "apple"
, plantDirOut = file.path('..', 'reports', group, "apple")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000002101a545080>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-10] | |…… | 11% | |…….. | 15% [unnamed-chunk-11] | |……… | 19% | |……….. | 22% [unnamed-chunk-12] | |…………. | 26% | |…………… | 30% [unnamed-chunk-13] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-14] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-15] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-16] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-17] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-18] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-19] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-20] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-21] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-22] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 20997 77128 32036 112138 30068 37682
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT4G10330 ath Maldo.hc.v1a1.ch10A.g00003 mdo FastOMA
## 2 AT4G10340 ath Maldo.hc.v1a1.ch10A.g00004 mdo FastOMA
## 3 AT5G44410 ath g1 mdo FastOMA
## 4 AT5G44440 ath g1 mdo FastOMA
## 5 AT1G03770 ath Maldo.hc.v1a1.ch10A.g02825 mdo MCScanX
## 6 AT1G03800 ath Maldo.hc.v1a1.ch10A.g02815 mdo MCScanX
## 7 ATMG00910 ath Maldo.hc.v1a1.sc45A.g49878 mdo MCScanX
## 8 ATMG01020 ath Maldo.hc.v1a1.sc45A.g49883 mdo MCScanX
## 9 AT2G46320 ath Maldo.hc.v1a1.ch1A.g26266 mdo OrthoDB
## 10 AT4G27940 ath Maldo.hc.v1a1.ch1A.g26266 mdo OrthoDB
## 11 AT2G07695 ath Maldo.hc.v1a1.sc164A.g48922 mdo OrthoDB
## 12 AT2G07695 ath Maldo.hc.v1a1.sc119A.g48697 mdo OrthoDB
## 13 AT2G28190 ath Maldo.hc.v1a1.ch6A.g38979 mdo PLAZA
## 14 AT2G07732 ath Maldo.hc.v1a1.sc90A.g49976 mdo PLAZA
## 15 AT5G17400 ath Maldo.hc.v1a1.ch9A.g48118 mdo PLAZA
## 16 AT2G47300 ath Maldo.hc.v1a1.ch17A.g24110 mdo PLAZA
## 17 AT1G01020 ath Maldo.hc.v1a1.ch7A.g41990 mdo RBH
## 18 AT1G01030 ath Maldo.hc.v1a1.ch1A.g25187 mdo RBH
## 19 ATMG01410 ath Maldo.hc.v1a1.sc36A.g49760 mdo RBH
## 20 ATMG01410 ath Maldo.hc.v1a1.sc71A.g49908 mdo RBH
## 21 AT4G39400 ath Maldo.hc.v1a1.ch2A.g27501 mdo compara
## 22 AT4G39400 ath Maldo.hc.v1a1.ch15A.g17036 mdo compara
## 23 AT1G65880 ath Maldo.hc.v1a1.ch17A.g24066 mdo compara
## 24 AT5G52820 ath Maldo.hc.v1a1.ch17A.g24067 mdo compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2250109 120.2 4385465 234.3 3000814 160.3
## Vcells 20054457 153.1 32427276 247.5 32422094 247.4
## [1] 23046
## [1] 34410
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 20997 77128 32036 56069 30068 37682
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ComplexUpset package.
## Please report the issue at
## <https://github.com/krassowski/complex-upset/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "all_pathways" "short_name"
## [17] "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 117481 27
## [1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
## [16] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 8153626 435.5 14083611 752.2 14083611 752.2
## Vcells 127008260 969.0 196766967 1501.3 196766967 1501.3
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 23180
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 23046
## # mdo unigue genes: 34410
## # ath highest connection degree: 128
## # mdo highest connection degree: 116
## # genes in ath with degree > 30: 319
## # genes in mdo with degree > 30: 288
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 72085 23180 13328 8915
##
## 100% match bad MapMan no match partial match
## 1 36847 10783 10916 7960
## 2 9865 3217 1663 576
## 3 6786 2076 473 178
## 4 6739 2218 150 104
## 5 7352 2776 96 69
## 6 4496 2110 30 28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## OrthoDB_MapMan4 FastOMA_MapMan4 RBH_MapMan4
## 3003 3905 1103
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 37256 12293 3640 2267
##
## 100% match bad MapMan no match partial match
## 1 9275 1870 1615 1604
## 2 5288 2021 1354 341
## 3 5006 1595 402 135
## 4 6010 2003 144 95
## 5 7181 2694 95 64
## 6 4496 2110 30 28
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 20470
## # mdo unigue genes: 31752
## # ath highest connection degree: 59
## # mdo highest connection degree: 102
## # genes in ath with degree > 30: 28
## # genes in mdo with degree > 30: 25
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 32036
## 2: compara 5278
## 3: PLAZA 6568
## 4: OrthoDB_FastOMA_RBH 579
## 5: OrthoDB_RBH 667
## 6: FastOMA_OrthoDB 1240
## 7: FastOMA_RBH 1077
## 8: OrthoDB_MapMan4 3003
## 9: RBH_MapMan4 1103
## 10: FastOMA_MapMan4 3905
## 11: reject 62052
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 163 121 71 35
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 1336 26 118 30
## PLAZA RBH_MapMan4 reject
## 264 28 1753
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'ppe'
, plantNameOut = "peach"
, plantDirOut = file.path('..', 'reports', group, "peach")
, flag = 1
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000210ad79ddd8>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-38] | |…… | 11% | |…….. | 15% [unnamed-chunk-39] | |……… | 19% | |……….. | 22% [unnamed-chunk-40] | |…………. | 26% | |…………… | 30% [unnamed-chunk-41] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-42] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-43] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-44] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-45] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-46] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-47] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-48] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-49] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-50] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 16193 44006 17894 76740 20791 24564
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 24 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath Prupe.1G000500 ppe FastOMA
## 2 AT1G62440 ath Prupe.1G000500 ppe FastOMA
## 3 AT1G61010 ath Prupe.I006100 ppe FastOMA
## 4 AT1G61010 ath Prupe.I006200 ppe FastOMA
## 5 AT1G04230 ath Prupe.1G027200 ppe MCScanX
## 6 AT1G04240 ath Prupe.1G027500 ppe MCScanX
## 7 ATCG00530 ath Prupe.7G029500 ppe MCScanX
## 8 ATCG00680 ath Prupe.7G029800 ppe MCScanX
## 9 AT3G17900 ath Prupe.1G267800 ppe OrthoDB
## 10 AT4G35230 ath Prupe.1G355500 ppe OrthoDB
## 11 AT2G07675 ath Prupe.6G146800 ppe OrthoDB
## 12 ATMG00980 ath Prupe.6G146800 ppe OrthoDB
## 13 AT3G02890 ath Prupe.2G329000 ppe PLAZA
## 14 AT5G16680 ath Prupe.2G329000 ppe PLAZA
## 15 AT4G03170 ath Prupe.4G260600 ppe PLAZA
## 16 AT4G03160 ath Prupe.4G260600 ppe PLAZA
## 17 AT1G01030 ath Prupe.5G134900 ppe RBH
## 18 AT1G01040 ath Prupe.2G200900 ppe RBH
## 19 ATMG01250 ath Prupe.6G123900 ppe RBH
## 20 ATMG01250 ath Prupe.7G164000 ppe RBH
## 21 AT5G01010 ath Prupe.6G300700 ppe compara
## 22 AT5G01020 ath Prupe.6G300300 ppe compara
## 23 AT1G80940 ath Prupe.3G103600 ppe compara
## 24 AT1G80950 ath Prupe.1G382400 ppe compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6073948 324.4 11266889 601.8 14083611 752.2
## Vcells 131792040 1005.5 226873834 1731.0 196766967 1501.3
## [1] 23113
## [1] 21245
##
## compara FastOMA MCScanX OrthoDB PLAZA RBH
## 16193 44006 17894 38370 20791 24564
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "PLAZA" "RBH" "compara"
## [9] "from_count" "to_count" "count_evidence" "ath_BINCODE"
## [13] "ath_NAME" "ath_DESCRIPTION" "all_pathways" "short_name"
## [17] "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 71222 231
## [1] "Prupe.I000100" "Prupe.I000200" "Prupe.I000200" "Prupe.I000200"
## [5] "Prupe.I000200" "Prupe.I000200" "Prupe.I000200" "Prupe.I000300"
## [9] "Prupe.I000300" "Prupe.I000300" "Prupe.I000400" "Prupe.I000400"
## [13] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [17] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [21] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [25] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [29] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [33] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [37] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [41] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [45] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000500"
## [49] "Prupe.I000500" "Prupe.I000500" "Prupe.I000500" "Prupe.I000600"
## [53] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [57] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [61] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [65] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [69] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [73] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [77] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [81] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [85] "Prupe.I000600" "Prupe.I000600" "Prupe.I000600" "Prupe.I000600"
## [89] "Prupe.I000600" "Prupe.I000600" "Prupe.I000700" "Prupe.I000700"
## [93] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [97] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [101] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [105] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [109] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [113] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [117] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [121] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [125] "Prupe.I000700" "Prupe.I000700" "Prupe.I000700" "Prupe.I000700"
## [129] "Prupe.I000700" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [133] "Prupe.I000800" "Prupe.I000800" "Prupe.I000800" "Prupe.I000800"
## [137] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I000900"
## [141] "Prupe.I000900" "Prupe.I000900" "Prupe.I000900" "Prupe.I001000"
## [145] "Prupe.I001000" "Prupe.I001000" "Prupe.I001000" "Prupe.I001000"
## [149] "Prupe.I001000" "Prupe.I001000" "Prupe.I001100" "Prupe.I001100"
## [153] "Prupe.I001100" "Prupe.I001600" "Prupe.I001700" "Prupe.I001700"
## [157] "Prupe.I001700" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [161] "Prupe.I001800" "Prupe.I001800" "Prupe.I001800" "Prupe.I001800"
## [165] "Prupe.I001800" "Prupe.I001800" "Prupe.I001900" "Prupe.I001900"
## [169] "Prupe.I002100" "Prupe.I002100" "Prupe.I002300" "Prupe.I002300"
## [173] "Prupe.I002300" "Prupe.I002400" "Prupe.I002400" "Prupe.I002600"
## [177] "Prupe.I002600" "Prupe.I002800" "Prupe.I002800" "Prupe.I002900"
## [181] "Prupe.I002900" "Prupe.I003000" "Prupe.I003000" "Prupe.I003100"
## [185] "Prupe.I003100" "Prupe.I003100" "Prupe.I003200" "Prupe.I003200"
## [189] "Prupe.I003200" "Prupe.I003300" "Prupe.I003400" "Prupe.I003400"
## [193] "Prupe.I003400" "Prupe.I003400" "Prupe.I003400" "Prupe.I003400"
## [197] "Prupe.I003400" "Prupe.I003800" "Prupe.I003800" "Prupe.I003900"
## [201] "Prupe.I003900" "Prupe.I004000" "Prupe.I004000" "Prupe.I004400"
## [205] "Prupe.I004400" "Prupe.I004400" "Prupe.I004400" "Prupe.I004400"
## [209] "Prupe.I004400" "Prupe.I004500" "Prupe.I004500" "Prupe.I004500"
## [213] "Prupe.I004500" "Prupe.I004500" "Prupe.I004500" "Prupe.I005000"
## [217] "Prupe.I005000" "Prupe.I005100" "Prupe.I005200" "Prupe.I005200"
## [221] "Prupe.I005200" "Prupe.I005200" "Prupe.I005200" "Prupe.I005500"
## [225] "Prupe.I005500" "Prupe.I005500" "Prupe.I005600" "Prupe.I005700"
## [229] "Prupe.I005800" "Prupe.I006100" "Prupe.I006200"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 6340525 338.7 11266889 601.8 14083611 752.2
## Vcells 91829013 700.6 226881563 1731.0 226881563 1731.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 16428
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 23113
## # ppe unigue genes: 21245
## # ath highest connection degree: 57
## # ppe highest connection degree: 115
## # genes in ath with degree > 30: 264
## # genes in ppe with degree > 30: 135
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 43931 16428 6360 4734
##
## 100% match bad MapMan no match partial match
## 1 21126 7929 5410 4119
## 2 6460 2272 560 368
## 3 4106 1255 185 103
## 4 4182 1339 103 68
## 5 4892 1998 68 51
## 6 3165 1635 34 25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## OrthoDB_MapMan4 FastOMA_MapMan4 RBH_MapMan4
## 2181 2171 641
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 23698 8434 1556 1482
##
## 100% match bad MapMan no match partial match
## 1 6038 1402 853 1043
## 2 3413 1431 366 219
## 3 2797 888 143 81
## 4 3607 1168 92 63
## 5 4678 1910 68 51
## 6 3165 1635 34 25
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 19985
## # ppe unigue genes: 20047
## # ath highest connection degree: 40
## # ppe highest connection degree: 102
## # genes in ath with degree > 30: 6
## # genes in ppe with degree > 30: 15
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 17894
## 2: compara 5084
## 3: PLAZA 4926
## 4: OrthoDB_FastOMA_RBH 415
## 5: OrthoDB_RBH 516
## 6: FastOMA_OrthoDB 1109
## 7: FastOMA_RBH 233
## 8: OrthoDB_MapMan4 2181
## 9: RBH_MapMan4 641
## 10: FastOMA_MapMan4 2171
## 11: reject 36283
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 145 50 43 7
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 728 27 48 16
## PLAZA RBH_MapMan4 reject
## 166 15 1149
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pdul'
, plantNameOut = "almond"
, plantDirOut = file.path('..', 'reports', group, "almond")
, flag = 2
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000210aba52270>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-66] | |…… | 11% | |…….. | 15% [unnamed-chunk-67] | |……… | 19% | |……….. | 22% [unnamed-chunk-68] | |…………. | 26% | |…………… | 30% [unnamed-chunk-69] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-70] | |………………… | 41% | |………………….. | 44% [unnamed-chunk-71] | |……………………. | 48% | |…………………….. | 52% [unnamed-chunk-72] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-73] | |………………………….. | 63% | |……………………………. | 67% [unnamed-chunk-74] | |……………………………… | 70% | |……………………………….. | 74% [unnamed-chunk-75] | |…………………………………. | 78% | |…………………………………… | 81% [unnamed-chunk-76] | |……………………………………. | 85% | |……………………………………… | 89% [unnamed-chunk-77] | |……………………………………….. | 93% | |…………………………………………. | 96% [unnamed-chunk-78] | |……………………………………………| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 15975 42927 20148 71468 24829
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G05620 ath TexasF1_G1000 pdul FastOMA
## 2 AT3G48880 ath TexasF1_G10001 pdul FastOMA
## 3 AT3G48890 ath TexasF1_G9999 pdul FastOMA
## 4 AT5G52240 ath TexasF1_G9999 pdul FastOMA
## 5 AT1G04220 ath TexasF1_G767 pdul MCScanX
## 6 AT1G04230 ath TexasF1_G779 pdul MCScanX
## 7 ATCG00170 ath TexasF1_G22620 pdul MCScanX
## 8 ATCG00500 ath TexasF1_G22624 pdul MCScanX
## 9 AT1G23390 ath TexasF1_G3359 pdul OrthoDB
## 10 AT5G19210 ath TexasF1_G2060 pdul OrthoDB
## 11 AT3G51810 ath TexasF1_G23162 pdul OrthoDB
## 12 AT2G28815 ath TexasF1_G6420 pdul OrthoDB
## 13 AT1G01030 ath TexasF1_G18833 pdul RBH
## 14 AT1G01040 ath TexasF1_G9057 pdul RBH
## 15 ATMG00860 ath TexasF1_G25095 pdul RBH
## 16 ATMG01250 ath TexasF1_G22012 pdul RBH
## 17 AT4G37540 ath TexasF1_G22582 pdul compara
## 18 AT5G67420 ath TexasF1_G22582 pdul compara
## 19 AT4G15960 ath TexasF1_G27715 pdul compara
## 20 AT3G45140 ath TexasF1_G6796 pdul compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5041852 269.3 11266889 601.8 14083611 752.2
## Vcells 94094229 717.9 226881563 1731.0 226881563 1731.0
## [1] 22451
## [1] 20903
##
## compara FastOMA MCScanX OrthoDB RBH
## 15975 42927 20148 35734 24829
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "all_pathways" "short_name" "athName"
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 67030
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4675758 249.8 11266889 601.8 14083611 752.2
## Vcells 59036007 450.5 181505251 1384.8 226881563 1731.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 14567
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22451
## # pdul unigue genes: 20903
## # ath highest connection degree: 150
## # pdul highest connection degree: 113
## # genes in ath with degree > 30: 171
## # genes in pdul with degree > 30: 120
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 39774 14567 7625 5064
##
## 100% match bad MapMan no match partial match
## 1 19498 7186 6154 4134
## 2 6174 1991 848 468
## 3 4138 1446 289 176
## 4 4954 1821 187 146
## 5 5010 2123 147 140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## OrthoDB_MapMan4 FastOMA_MapMan4 RBH_MapMan4
## 2219 2716 826
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 22360 7078 1865 1879
##
## 100% match bad MapMan no match partial match
## 1 6023 694 690 1162
## 2 3432 1344 600 297
## 3 3215 1187 249 144
## 4 4680 1730 179 136
## 5 5010 2123 147 140
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 19490
## # pdul unigue genes: 19029
## # ath highest connection degree: 39
## # pdul highest connection degree: 102
## # genes in ath with degree > 30: 5
## # genes in pdul with degree > 30: 13
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 20148
## 2: compara 4234
## 3: OrthoDB_FastOMA_RBH 722
## 4: OrthoDB_RBH 692
## 5: FastOMA_OrthoDB 1038
## 6: FastOMA_RBH 587
## 7: OrthoDB_MapMan4 2219
## 8: RBH_MapMan4 826
## 9: FastOMA_MapMan4 2716
## 10: reject 33848
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 118 68 53 25
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 843 42 63 19
## RBH_MapMan4 reject
## 32 1142
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pavi'
, plantNameOut = "wildcherry"
, plantDirOut = file.path('..', 'reports', group, "wildcherry")
, flag = 2
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000002103781bd60>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-94] | |…… | 11% | |……. | 15% [unnamed-chunk-95] | |……… | 19% | |……….. | 22% [unnamed-chunk-96] | |…………. | 26% | |…………… | 30% [unnamed-chunk-97] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-98] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-99] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-100] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-101] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-102] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-103] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-104] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-105] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-106] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 4367 45924 19708 76456 25594
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 20 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath FUN_000050 pavi FastOMA
## 2 AT1G62440 ath FUN_000050 pavi FastOMA
## 3 AT2G43840 ath FUN_040335 pavi FastOMA
## 4 AT2G44050 ath FUN_040336 pavi FastOMA
## 5 AT1G04220 ath FUN_000300 pavi MCScanX
## 6 AT1G04230 ath FUN_000316 pavi MCScanX
## 7 ATMG01190 ath FUN_026738 pavi MCScanX
## 8 ATMG00910 ath FUN_040205 pavi MCScanX
## 9 AT4G39370 ath FUN_020728 pavi OrthoDB
## 10 AT3G06350 ath FUN_020749 pavi OrthoDB
## 11 AT4G24220 ath FUN_029917 pavi OrthoDB
## 12 AT4G24220 ath FUN_029968 pavi OrthoDB
## 13 AT1G01030 ath FUN_025493 pavi RBH
## 14 AT1G01040 ath FUN_011748 pavi RBH
## 15 ATMG01250 ath FUN_040221 pavi RBH
## 16 ATMG01360 ath FUN_026804 pavi RBH
## 17 AT2G22690 ath FUN_021390 pavi compara
## 18 AT4G37880 ath FUN_021390 pavi compara
## 19 AT4G39770 ath FUN_021012 pavi compara
## 20 AT4G21740 ath FUN_032538 pavi compara
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4085657 218.2 11266889 601.8 14083611 752.2
## Vcells 61491393 469.2 181505251 1384.8 226881563 1731.0
## [1] 22167
## [1] 21950
##
## compara FastOMA MCScanX OrthoDB RBH
## 4367 45924 19708 38228 25594
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "compara" "from_count"
## [9] "to_count" "count_evidence" "ath_BINCODE" "ath_NAME"
## [13] "ath_DESCRIPTION" "all_pathways" "short_name" "athName"
## [17] "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 71643 2
## [1] "FUN_040149" "FUN_040149"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4734923 252.9 11266889 601.8 14083611 752.2
## Vcells 60112170 458.7 181547276 1385.1 226881563 1731.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 15130
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22167
## # pavi unigue genes: 21950
## # ath highest connection degree: 122
## # pavi highest connection degree: 115
## # genes in ath with degree > 30: 242
## # genes in pavi with degree > 30: 131
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 41733 15130 8890 5892
##
## 100% match bad MapMan no match partial match
## 1 20986 7805 7489 4881
## 2 7358 2253 946 572
## 3 6191 2096 298 243
## 4 6023 2400 131 163
## 5 1175 576 26 33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## OrthoDB_MapMan4 FastOMA_MapMan4 RBH_MapMan4
## 2958 3542 967
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 24400 7051 1933 2488
##
## 100% match bad MapMan no match partial match
## 1 7098 513 732 1692
## 2 4829 1685 770 385
## 3 5345 1909 276 215
## 4 5953 2368 129 163
## 5 1175 576 26 33
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 19167
## # pavi unigue genes: 19574
## # ath highest connection degree: 60
## # pavi highest connection degree: 102
## # genes in ath with degree > 30: 9
## # genes in pavi with degree > 30: 22
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 19708
## 2: compara 1309
## 3: OrthoDB_FastOMA_RBH 3052
## 4: OrthoDB_RBH 1307
## 5: FastOMA_OrthoDB 2133
## 6: FastOMA_RBH 896
## 7: OrthoDB_MapMan4 2958
## 8: RBH_MapMan4 967
## 9: FastOMA_MapMan4 3542
## 10: reject 35773
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## compara FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH
## 47 149 79 36
## MCScanX OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH
## 789 84 108 61
## RBH_MapMan4 reject
## 26 1370
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'parm'
, plantNameOut = "apricot"
, plantDirOut = file.path('..', 'reports', group, "apricot")
, flag = 3
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x000002100fcadbf8>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-122] | |…… | 11% | |……. | 15% [unnamed-chunk-123] | |……… | 19% | |……….. | 22% [unnamed-chunk-124] | |…………. | 26% | |…………… | 30% [unnamed-chunk-125] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-126] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-127] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-128] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-129] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-130] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-131] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-132] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-133] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-134] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 43038 18615 104168 25259
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 16 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath PruarM.1G000500 parm FastOMA
## 2 AT1G62440 ath PruarM.1G000500 parm FastOMA
## 3 AT2G47410 ath PruarM.8G368500 parm FastOMA
## 4 AT4G02060 ath PruarM.8G368700 parm FastOMA
## 5 AT1G04400 ath PruarM.1G051900 parm MCScanX
## 6 AT1G04410 ath PruarM.1G052500 parm MCScanX
## 7 AT5G64410 ath PruarM.8G146300 parm MCScanX
## 8 AT5G64410 ath PruarM.8G146200 parm MCScanX
## 9 AT5G10270 ath PruarM.1G279700 parm OrthoDB
## 10 AT5G64960 ath PruarM.1G279700 parm OrthoDB
## 11 AT2G15790 ath PruarM.8G195500 parm OrthoDB
## 12 AT4G34660 ath PruarM.8G163400 parm OrthoDB
## 13 AT1G01010 ath PruarM.2G368400 parm RBH
## 14 AT1G01030 ath PruarM.5G193300 parm RBH
## 15 ATMG01360 ath PruarM.4G189900 parm RBH
## 16 ATMG01360 ath PruarM.4G190100 parm RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4122851 220.2 11266889 601.8 14083611 752.2
## Vcells 62494882 476.8 181547276 1385.1 226881563 1731.0
## [1] 22351
## [1] 22551
##
## FastOMA MCScanX OrthoDB RBH
## 43038 18615 52084 25259
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "from_count" "to_count"
## [9] "count_evidence" "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION"
## [13] "all_pathways" "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 84042
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4075996 217.7 11266889 601.8 14083611 752.2
## Vcells 48022776 366.4 145237821 1108.1 226881563 1731.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 27414
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22351
## # parm unigue genes: 22551
## # ath highest connection degree: 267
## # parm highest connection degree: 113
## # genes in ath with degree > 30: 344
## # genes in parm with degree > 30: 392
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 40370 27414 10212 6046
##
## 100% match bad MapMan no match partial match
## 1 20994 20423 8961 4950
## 2 7132 2400 896 600
## 3 6352 2247 227 306
## 4 5892 2344 128 190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## OrthoDB_MapMan4 FastOMA_MapMan4 RBH_MapMan4
## 3175 3319 1196
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 23682 6793 1675 2451
##
## 100% match bad MapMan no match partial match
## 1 7527 545 606 1563
## 2 4655 1845 728 426
## 3 5608 2059 213 272
## 4 5892 2344 128 190
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 18948
## # parm unigue genes: 18528
## # ath highest connection degree: 66
## # parm highest connection degree: 102
## # genes in ath with degree > 30: 10
## # genes in parm with degree > 30: 18
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 18615
## 2: OrthoDB_FastOMA_RBH 3716
## 3: OrthoDB_RBH 1371
## 4: FastOMA_OrthoDB 2357
## 5: FastOMA_RBH 852
## 6: OrthoDB_MapMan4 3175
## 7: RBH_MapMan4 1196
## 8: FastOMA_MapMan4 3319
## 9: reject 49441
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH MCScanX
## 154 110 23 768
## OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH RBH_MapMan4
## 123 128 56 31
## reject
## 1482
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pcox'
, plantNameOut = "pear"
, plantDirOut = file.path('..', 'reports', group, "pear")
, flag = 4
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000210a3d35938>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-150] | |…… | 11% | |……. | 15% [unnamed-chunk-151] | |……… | 19% | |……….. | 22% [unnamed-chunk-152] | |…………. | 26% | |…………… | 30% [unnamed-chunk-153] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-154] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-155] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-156] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-157] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-158] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-159] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-160] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-161] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-162] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 75969 33983 36090
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 12 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G20520 ath Pyrco.da.v2a1.augustus.000230 pcox FastOMA
## 2 AT1G76210 ath Pyrco.da.v2a1.augustus.000230 pcox FastOMA
## 3 AT5G53090 ath Pyrco.da.v2a1.snap.445350 pcox FastOMA
## 4 AT5G53100 ath Pyrco.da.v2a1.snap.445350 pcox FastOMA
## 5 AT1G03900 ath Pyrco.da.v2a1.chr10A.089110 pcox MCScanX
## 6 AT1G03920 ath Pyrco.da.v2a1.chr10A.089090 pcox MCScanX
## 7 AT5G56870 ath Pyrco.da.v2a1.chr9A.225970 pcox MCScanX
## 8 AT5G57035 ath Pyrco.da.v2a1.chr9A.225390 pcox MCScanX
## 9 AT1G01030 ath Pyrco.da.v2a1.chr14A.371380 pcox RBH
## 10 AT1G01030 ath Pyrco.da.v2a1.chr1A.345960 pcox RBH
## 11 ATMG01250 ath Pyrco.da.v2a1.chr5A.045340 pcox RBH
## 12 ATMG01250 ath Pyrco.da.v2a1.snap.153710 pcox RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3663517 195.7 11266889 601.8 14083611 752.2
## Vcells 47066163 359.1 145237821 1108.1 226881563 1731.0
## [1] 21603
## [1] 30838
##
## FastOMA MCScanX RBH
## 75969 33983 36090
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "RBH" "from_count" "to_count" "count_evidence"
## [9] "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION" "all_pathways"
## [13] "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 95885
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3713411 198.4 11266889 601.8 14083611 752.2
## Vcells 41971878 320.3 145237821 1108.1 226881563 1731.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 16546
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 21603
## # pcox unigue genes: 30838
## # ath highest connection degree: 136
## # pcox highest connection degree: 116
## # genes in ath with degree > 30: 261
## # genes in pcox with degree > 30: 287
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 60208 16546 10595 8536
##
## 100% match bad MapMan no match partial match
## 1 36081 8117 9410 7873
## 2 12902 4306 975 468
## 3 11225 4123 210 195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## FastOMA_MapMan4 RBH_MapMan4
## 13297 2564
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 38785 9730 3471 3470
##
## 100% match bad MapMan no match partial match
## 1 16409 1653 2355 2871
## 2 11151 3954 906 404
## 3 11225 4123 210 195
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 19450
## # pcox unigue genes: 28452
## # ath highest connection degree: 60
## # pcox highest connection degree: 102
## # genes in ath with degree > 30: 109
## # genes in pcox with degree > 30: 78
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 33983
## 2: FastOMA_RBH 5612
## 3: RBH_MapMan4 2564
## 4: FastOMA_MapMan4 13297
## 5: reject 40429
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 493 175 1459 109 1068
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'pcer'
, plantNameOut = "cherryplum"
, plantDirOut = file.path('..', 'reports', group, "cherryplum")
, flag = 4
)
# note: in compara - geneID and prot ID are completely different
env <- new.env()
list2env(params_list, envir = env)<environment: 0x00000210a300f0e8>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-178] | |…… | 11% | |……. | 15% [unnamed-chunk-179] | |……… | 19% | |……….. | 22% [unnamed-chunk-180] | |…………. | 26% | |…………… | 30% [unnamed-chunk-181] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-182] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-183] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-184] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-185] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-186] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-187] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-188] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-189] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-190] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 162100 63358 80487
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 12 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT2G32760 ath Pcer_000001-RA pcer FastOMA
## 2 AT1G07920 ath Pcer_000002-RA pcer FastOMA
## 3 AT1G16650 ath Pcer_097557-RA pcer FastOMA
## 4 AT1G53530 ath Pcer_097558-RA pcer FastOMA
## 5 AT1G04220 ath Pcer_000137-RA pcer MCScanX
## 6 AT1G04230 ath Pcer_000150-RA pcer MCScanX
## 7 ATMG00080 ath Pcer_091446-RA pcer MCScanX
## 8 ATMG00513 ath Pcer_091469-RA pcer MCScanX
## 9 AT1G01030 ath Pcer_027461-RA pcer RBH
## 10 AT1G01030 ath Pcer_038773-RA pcer RBH
## 11 ATMG01330 ath Pcer_091451-RA pcer RBH
## 12 ATMG01360 ath Pcer_096779-RA pcer RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3549616 189.6 11266889 601.8 14083611 752.2
## Vcells 45685339 348.6 145237821 1108.1 226881563 1731.0
## [1] 22197
## [1] 71437
##
## FastOMA MCScanX RBH
## 162100 63358 80487
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "RBH" "from_count" "to_count" "count_evidence"
## [9] "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION" "all_pathways"
## [13] "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE TRUE
## 201293 3
## [1] "Pcer_097367-RB" "Pcer_097392-RB" "Pcer_097544-RB"
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4229286 225.9 11266889 601.8 14083611 752.2
## Vcells 57585409 439.4 145338077 1108.9 226881563 1731.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 40662
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22197
## # pcer unigue genes: 71437
## # ath highest connection degree: 270
## # pcer highest connection degree: 113
## # genes in ath with degree > 30: 890
## # genes in pcer with degree > 30: 449
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 126211 40662 19225 15198
##
## 100% match bad MapMan no match partial match
## 1 75114 21956 16707 13585
## 2 29613 10371 1995 1240
## 3 21484 8335 523 373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## FastOMA_MapMan4 RBH_MapMan4
## 34806 6865
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 89442 20615 4446 8578
##
## 100% match bad MapMan no match partial match
## 1 41943 2600 2064 7103
## 2 26015 9680 1859 1102
## 3 21484 8335 523 373
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 20165
## # pcer unigue genes: 63906
## # ath highest connection degree: 209
## # pcer highest connection degree: 102
## # genes in ath with degree > 30: 461
## # genes in pcer with degree > 30: 166
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 63358
## 2: FastOMA_RBH 18052
## 3: RBH_MapMan4 6865
## 4: FastOMA_MapMan4 34806
## 5: reject 78215
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_MapMan4 FastOMA_RBH MCScanX RBH_MapMan4 reject
## 1418 583 2541 266 2659
openxlsx::write.xlsx(pss,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_pss_orthologues-kept-rejected_2025-09-15.xlsx'),
asTable = TRUE)params_list <- list(
plantName = 'psib'
, plantNameOut = "siberianapricot"
, plantDirOut = file.path('..', 'reports', group, "siberianapricot")
, flag = 3
)
env <- new.env()
list2env(params_list, envir = env)<environment: 0x0000021055ab0768>
##
##
## processing file: ./09_translate-child.Rmd
| | | 0% | |.. | 4% | |…. | 7% [unnamed-chunk-206] | |…… | 11% | |……. | 15% [unnamed-chunk-207] | |……… | 19% | |……….. | 22% [unnamed-chunk-208] | |…………. | 26% | |…………… | 30% [unnamed-chunk-209] | |…………….. | 33% | |………………. | 37% [unnamed-chunk-210] | |……………….. | 41% | |…………………. | 44% [unnamed-chunk-211] | |…………………… | 48% | |…………………….. | 52% [unnamed-chunk-212] | |………………………. | 56% | |………………………… | 59% [unnamed-chunk-213] | |…………………………. | 63% | |…………………………… | 67% [unnamed-chunk-214] | |…………………………….. | 70% | |………………………………. | 74% [unnamed-chunk-215] | |………………………………… | 78% | |………………………………….. | 81% [unnamed-chunk-216] | |……………………………………. | 85% | |…………………………………….. | 89% [unnamed-chunk-217] | |………………………………………. | 93% | |………………………………………… | 96% [unnamed-chunk-218] | |…………………………………………..| 100%
fp = file.path('..', 'intermediate')
fl = list.files(fp, full.names = TRUE)
fl = fl[grep(paste0('_cleaned'), fl)] # change names
fl = fl[grep('\\.zip$', fl)]
df = NULL
for (i in fl){
print(i)
dt = data.table::fread(i)
us = unique(dt$source)
if(us == 'compara') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'FastOMA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'MCScanX') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'PLAZA') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'OrthoDB') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else if (us == 'RBH') {
dt = dt[dt$to_plant == plantName, ]
df = rbind(df, dt)
} else cat ('Ignore source(s):', us, '\n')
}## [1] "../intermediate/_OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/comparaPlants-hc_cleaned.txt.zip"
## [1] "../intermediate/FastOMA_cleaned.txt.zip"
## [1] "../intermediate/JCVI_MCScanX_cleaned.txt.zip"
## [1] "../intermediate/mercator_cleaned.txt.zip"
## Ignore source(s): Mercator
## [1] "../intermediate/OrthoDB_cleaned.txt.zip"
## [1] "../intermediate/PLAZA_cleaned.txt.zip"
## [1] "../intermediate/RBH_cleaned.txt.zip"
## Pre filter Sources:
## 40732 16398 20889 25288
colnames(df)[1] = "from_geneID"
colnames(df)[3] = "to_geneID"
df %>%
dplyr::group_by(source) %>%
dplyr::slice_head(n = 2) %>%
dplyr::bind_rows(df %>% dplyr::group_by(source) %>% dplyr::slice_tail(n = 2)) %>%
dplyr::arrange(source) %>%
dplyr::ungroup() -> first_last_three_per_source
print(first_last_three_per_source, n = nrow(first_last_three_per_source))## # A tibble: 16 × 5
## from_geneID from_plant to_geneID to_plant source
## <chr> <chr> <chr> <chr> <chr>
## 1 AT1G12040 ath PaF106G0100000005 psib FastOMA
## 2 AT1G62440 ath PaF106G0100000005 psib FastOMA
## 3 AT3G07140 ath PaF106G0800032954 psib FastOMA
## 4 AT3G07140 ath PaF106G0800032956 psib FastOMA
## 5 AT1G04230 ath PaF106G0100000358 psib MCScanX
## 6 AT1G04240 ath PaF106G0100000361 psib MCScanX
## 7 ATCG00340 ath PaF106G0700028636 psib MCScanX
## 8 ATCG00350 ath PaF106G0700028635 psib MCScanX
## 9 AT2G43200 ath PaF106G0500020125 psib OrthoDB
## 10 AT1G52770 ath PaF106G0300013722 psib OrthoDB
## 11 AT3G56950 ath PaF106G0700028984 psib OrthoDB
## 12 AT5G37530 ath PaF106G0300012640 psib OrthoDB
## 13 AT1G01030 ath PaF106G0500020091 psib RBH
## 14 AT1G01040 ath PaF106G0200009357 psib RBH
## 15 ATMG01190 ath PaF106G0600023358 psib RBH
## 16 ATMG01250 ath PaF106G0700028671 psib RBH
rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3909777 208.9 9013512 481.4 14083611 752.2
## Vcells 63705863 486.1 118542936 904.5 226881563 1731.0
## [1] 22040
## [1] 20342
##
## FastOMA MCScanX OrthoDB RBH
## 40732 16398 20889 25288
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) {
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
dt.wide[, count_evidence := rowSums(.SD), .SDcols = source_cols]
hist(dt.wide$count_evidence, main = paste0('# ath-', plantName, ' evidence'))dff = as.data.frame(dt.wide)
upset_plot = upset(
dff,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods") +
theme(legend.position = "none")
# Print or/and save the plot
print(upset_plot)# counting occurences
from_counts = dt.wide[, .N, by = from_geneID]
setnames(from_counts, "N", "from_count")
to_counts = dt.wide[, .N, by = to_geneID]
setnames(to_counts, "N", "to_count")
dt.wide = merge(dt.wide, to_counts, by = "to_geneID", all.x = TRUE)
dt.wide = merge(dt.wide, from_counts, by = "from_geneID", all.x = TRUE)
ind = c(grep('from_geneID|to_geneID|FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara', colnames(dt.wide)),
grep('from_count', colnames(dt.wide)),
grep('to_count', colnames(dt.wide)),
grep('count_evidence', colnames(dt.wide)))
##### take care here
dt.wide = dt.wide[, ..ind]df = merge(dt.wide, ath.gmm, by.x = 'from_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
df = merge(df, ath.annot, by.x = 'from_geneID', by.y = 'id', all.x = TRUE)
nin = ath.annot[which(!(ath.annot$id[!is.na(ath.annot$all_pathways)] %in% df$from_geneID)), ]
nin = nin[grep('^AT', nin$id), ]
nin = merge(nin, ath.gmm, by.x = 'id', by.y = 'IDENTIFIER', all.x = TRUE)
setorder(nin, short_name)
openxlsx::write.xlsx(nin,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut , '-ath_pss_no-orthologues_2025-09-15.xlsx'),
asTable = TRUE) # change namefp = file.path('..', 'intermediate')
fn = 'mercator_cleaned.txt.zip'
gmm = data.table::fread(file.path(fp, fn), header = TRUE, fill = TRUE)
combined = gmm[gmm$plant == plantName, ]
setDT(combined)
combined[, plant := NULL]
combined[, source := NULL]
colnames(combined)[2:4] = paste('plant', colnames(combined)[2:4], sep = '_')
colnames(df)## [1] "from_geneID" "to_geneID" "FastOMA" "MCScanX"
## [5] "OrthoDB" "RBH" "from_count" "to_count"
## [9] "count_evidence" "ath_BINCODE" "ath_NAME" "ath_DESCRIPTION"
## [13] "all_pathways" "short_name" "athName" "athSynonims"
dt = merge(df, combined, by.x = 'to_geneID', by.y = 'IDENTIFIER', all.x = TRUE, all.y = FALSE)
table(is.na(dt$plant_BINCODE))##
## FALSE
## 58143
## character(0)
dt_cols = colnames(df)
new_cols = setdiff(colnames(dt), c(dt_cols))
dt = as.data.frame(dt)
df = dt[, c(dt_cols, new_cols)]rm(list = setdiff(ls(), c("df",
"ath.gmm", "ath.annot",
"group",
"plantName",
"plantNameOut",
"plantDirOut",
"flag")))
gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3843785 205.3 9013512 481.4 14083611 752.2
## Vcells 42433627 323.8 118542936 904.5 226881563 1731.0
MapMan Mercator matches: first three levels only
df = df[!duplicated(df), ]
compare_bin <- function(athMercator, plantXMercator) {
# Helper: Split and truncate tokens to first 3 dot-separated levels
split_tokens <- function(code) {
if (is.na(code) || trimws(code) == "") return(character(0))
parts <- unlist(strsplit(code, "\\|"))
tokens <- unlist(strsplit(parts, ";"))
tokens <- unique(trimws(tokens))
trunc3levels <- function(token) {
levels <- unlist(strsplit(token, "\\."))
paste(head(levels, 3), collapse = ".")
}
unique(sapply(tokens, trunc3levels))
}
bin_set <- split_tokens(athMercator)
v4_set <- split_tokens(plantXMercator)
# If both sets are empty, return "no match"
if (length(bin_set) == 0 && length(v4_set) == 0) {
return("no match")
}
# Check for redundant annotation (e.g. "35.2|35.2|35.2")
v4_parts <- unlist(strsplit(plantXMercator, "\\|"))
if (length(bin_set) == 1 &&
length(v4_parts) > 1 &&
all(split_tokens(plantXMercator) == bin_set)) {
result <- paste0("100% match based on ", bin_set)
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for exact match
if (setequal(bin_set, v4_set) && length(bin_set) > 0) {
result <- paste0("100% match based on ", paste(bin_set, collapse = ", "))
if (result == "100% match based on 35.2") return("bad MapMan")
return(result)
}
# Check for partial match
common_tokens <- intersect(bin_set, v4_set)
if (length(common_tokens) > 0) {
return(paste0("partial match based on ", paste(common_tokens, collapse = ", ")))
}
return("no match")
}
df = df %>%
dplyr::rowwise() %>%
dplyr::mutate(MapMan4_Match = compare_bin(ath_BINCODE, plant_BINCODE)) %>% # change name
dplyr::ungroup()
table(df$ath_BINCODE[df$MapMan4_Match == "bad MapMan"], df$plant_BINCODE[df$MapMan4_Match == "bad MapMan"])##
## 35.2
## 35.2 12813
## #### #### before filter #### ####
cat("\t# ath unigue genes:", length(unique(df$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(df$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(df$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(df$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(df$from_geneID[df$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(df$to_geneID[df$to_count > 30])), "\n\n"
)## # ath unigue genes: 22040
## # psib unigue genes: 20342
## # ath highest connection degree: 58
## # psib highest connection degree: 115
## # genes in ath with degree > 30: 135
## # genes in psib with degree > 30: 106
dt = as.data.table(df)
dt[, filter_criteria := "reject"]
covered_genes = character()
if (flag == 1) {
methods = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
methods = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
methods = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
methods = c("MCScanX", "FastOMA", 'RBH')
}
match_categories = c("no match", "100% match", "partial match", "bad MapMan")
long_dt = rbindlist(lapply(methods, function(method) {
dt[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = dt[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 35534 12813 6004 3792
##
## 100% match bad MapMan no match partial match
## 1 18977 6640 5025 3323
## 2 6361 2039 641 312
## 3 6065 2253 249 97
## 4 4131 1881 89 60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-before_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
if (flag != 4) {
special_methods = c("OrthoDB", "FastOMA", 'RBH')
} else {
special_methods = c("FastOMA", 'RBH')
}
# Initialize a named vector to count method_MapMan4 assignments
mapman4_counts = setNames(rep(0, length(special_methods)), paste0(special_methods, "_MapMan4"))
for (method in methods) {
base_cond = dt$filter_criteria == "reject" & dt[[method]] == TRUE &
!(dt$to_geneID %in% covered_genes) & !(dt$from_geneID %in% covered_genes)
add_cond = rep(TRUE, nrow(dt))
if (method %in% special_methods) {
add_cond = rep(TRUE, nrow(dt))
}
candidates = which(base_cond & add_cond)
if (length(candidates) > 0) {
if (method %in% special_methods) {
for (i in candidates) {
row = dt[i]
covered_by = special_methods[sapply(special_methods, function(m) row[[m]] == TRUE)]
count_covered = length(covered_by)
is_candidate = FALSE
new_criteria = NULL
if (count_covered == 3) {
is_candidate = TRUE
new_criteria = "OrthoDB_FastOMA_RBH"
} else if (count_covered == 2) {
is_candidate = TRUE
new_criteria = paste(sort(covered_by), collapse = "_")
} else if (count_covered == 1) {
# Check MapMan4_Match string contains "match based on" and method name (case-insensitive)
if (grepl("match based on", row$MapMan4_Match, ignore.case = TRUE)) {
is_candidate = TRUE
new_criteria = paste0(method, "_MapMan4")
# Increment count for this mapman4 assignment
mapman4_counts[[new_criteria]] = mapman4_counts[[new_criteria]] + 1
}
}
if (is_candidate) {
dt[i, filter_criteria := new_criteria]
# covered_genes = unique(c(covered_genes, row$to_geneID, row$from_geneID)) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, row$to_geneID))
}
}
} else {
dt[candidates, filter_criteria := method]
# covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)], dt[candidates, unique(from_geneID)])) # if you want to remove both IDs from pair
covered_genes = unique(c(covered_genes, dt[candidates, unique(to_geneID)]))
}
}
}
# After the loop, print checkpoint counts for method_MapMan4 assignments
cat('#### #### MapMan4 assignment counts per method #### #### \n')## #### #### MapMan4 assignment counts per method #### ####
## OrthoDB_MapMan4 FastOMA_MapMan4 RBH_MapMan4
## 1102 5266 1947
rejected = df[df$filter_criteria == 'reject', ]
kept = df[df$filter_criteria != 'reject', ]
# Update counts by reference in dt.wide (no merge needed)
setDT(df)
df[, from_count := .N, by = from_geneID]
df[, to_count := .N, by = to_geneID]
kept[, from_count := .N, by = from_geneID]
kept[, to_count := .N, by = to_geneID]
par(mfrow = c(2,2))
xlim = c(0,100)
h1 = hist(df$from_count, plot = FALSE, breaks = "Sturges")
h2 = hist(kept$from_count, plot = FALSE, breaks = "Sturges")
h3 = hist(df$to_count, plot = FALSE, breaks = "Sturges")
h4 = hist(kept$to_count, plot = FALSE, breaks = "Sturges")
max_count = max(c(h1$counts, h2$counts, h3$counts, h4$counts))
hist(df$from_count, main = "ath", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$from_count, main = "ath kept", xlab = "from_count", xlim = xlim, ylim = c(0, max_count))
hist(df$to_count, main = plantName, xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
hist(kept$to_count, main = paste(plantName, "kept"), xlab = "to_count", xlim = xlim, ylim = c(0, max_count))
par(mfrow = c(1,1))
mtext("Degrees before and afer filter", side = 3, line = -1.5, outer = TRUE, cex = 1.5)long_dt = rbindlist(lapply(methods, function(method) {
kept[, .(
Method = method,
Match_Type = match_categories,
Count = c(
sum(get(method) == TRUE & MapMan4_Match == "no match"),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^100% match")),
sum(get(method) == TRUE & stringr::str_detect(MapMan4_Match, "^partial match")),
sum(get(method) == TRUE & MapMan4_Match == "bad MapMan")
)
)]
}), use.names = TRUE)
# Rename "bad MapMan" to "35.2"
long_dt[, Match_Type := factor(
fcase(
as.character(Match_Type) == "bad MapMan", "35.2",
default = as.character(Match_Type)
),
levels = c("no match", "35.2", "partial match", "100% match")
)]
long_dt[, Match_Type := factor(Match_Type, levels = c("no match", "35.2", "partial match", "100% match"))]
ggplot2::ggplot(long_dt, ggplot2::aes(x = Method, y = Count, fill = Match_Type)) +
ggplot2::geom_bar(stat = "identity") +
ggplot2::labs(title = "MapMan match types count per method (post filter)",
x = "Method",
y = "Count",
fill = "Match Type") +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
ggplot2::theme_minimal() +
ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter1.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
dtsub = kept[, .SD, .SDcols = grep("count_evidence|MapMan4_Match", names(dt), value = TRUE)]
dtsub$MapMan4_Match = sub('based on.*', '', dtsub$MapMan4_Match)
table(dtsub$MapMan4_Match)##
## 100% match bad MapMan no match partial match
## 22449 6409 1748 1682
##
## 100% match bad MapMan no match partial match
## 1 8325 859 878 1318
## 2 4482 1595 540 212
## 3 5511 2074 241 92
## 4 4131 1881 89 60
tab = as.data.table(as.data.frame(table(dtsub$count_evidence, dtsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "MapMan4_Match", "Freq"))
tab$MapMan4_Match = stringr::str_trim(as.character(tab$MapMan4_Match), side = "right")
tab$MapMan4_Match <- factor(
ifelse(tab$MapMan4_Match == "bad MapMan", "35.2", tab$MapMan4_Match),
levels = c("no match", "35.2", "partial match", "100% match")
)
ggplot(tab, aes(x = factor(count_evidence), y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
labs(title = "Consistency of MapMan Match by # methods coverage (post filter)",
x = "count_evidence",
y = "Frequency",
fill = "MapMan4_Match") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter2.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
keptsub = kept[, .SD, .SDcols = grep("FastOMA|MCScanX|OrthoDB|PLAZA|RBH|compara|count_evidence|MapMan4_Match|filter_criteria",
names(kept), value = TRUE)]
# rename 'bad MapMan' to '35.2'
keptsub$MapMan4_Match = sub(' based on.*', '', keptsub$MapMan4_Match)
keptsub$MapMan4_Match = ifelse(keptsub$MapMan4_Match == "bad MapMan", "35.2", keptsub$MapMan4_Match)
# frequency table
tab = as.data.table(as.data.frame(table(keptsub$count_evidence, keptsub$filter_criteria, keptsub$MapMan4_Match)))
setnames(tab, c("count_evidence", "filter_criteria", "MapMan4_Match", "Freq"))
# Filter
tab = tab[Freq > 0]
tab[, count_evidence := factor(count_evidence)]
tab[, filter_criteria := factor(filter_criteria, levels = c("MCScanX", "compara", "PLAZA",
"OrthoDB_FastOMA_RBH",
"FastOMA_OrthoDB", "OrthoDB_FastOMA", "OrthoDB_RBH", "FastOMA_RBH",
"OrthoDB_MapMan4", "RBH_MapMan4", "FastOMA_MapMan4"
))]
tab[, MapMan4_Match := factor(MapMan4_Match, levels = c('no match', '35.2', 'partial match', '100% match'))]
ggplot(tab, aes(x = filter_criteria, y = Freq, fill = MapMan4_Match)) +
geom_bar(stat = "identity") +
facet_wrap(~ count_evidence, nrow = 2, drop = TRUE) +
labs(
title = "Selection by method and methods coverage",
x = "Filter criteria",
y = "Frequency",
fill = "MapMan4 Match"
) +
theme_minimal() +
ggplot2::scale_fill_brewer(palette = "Set3", direction = -1) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1),
panel.border = element_rect(color = "black", fill = NA, size = 1), # border around each facet
)ggplot2::ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "-after_filter3.pdf"),
device = "pdf", width = 6, height = 6, units = "in")
openxlsx::write.xlsx(rejected,
paste0('../reports/', group, "/", plantNameOut, '/', plantNameOut, '-ath_orthologues-removed_2025-09-15.xlsx'),
asTable = TRUE)
edges = unique(kept[, .(from_geneID, to_geneID)])
g = igraph::graph_from_data_frame(edges, directed = FALSE)
comp = igraph::components(g)
membership_dt = data.table(
geneID = names(comp$membership),
weak_component = comp$membership
)
# in case of directed graph
kept = merge(kept, membership_dt, by.x = "from_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "from_component")
# kept = merge(kept, membership_dt, by.x = "to_geneID", by.y = "geneID", all.x = TRUE)
# setnames(kept, "weak_component", "to_component")
# # but its undirected
# kept[, weak_component := from_component]
# # cleanup
# kept[, c("from_component", "to_component") := NULL]
openxlsx::write.xlsx(kept,
paste0('../output/', plantNameOut , '-ath_orthologues-kept_2025-09-15.xlsx'),
asTable = TRUE)
if (flag == 1) {
source_cols = c("MCScanX", "compara", "PLAZA", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 2) { # make flags
source_cols = c("MCScanX", "compara", 'OrthoDB', "FastOMA", 'RBH')
} else if (flag == 3) {
source_cols = c("MCScanX", 'OrthoDB', "FastOMA", 'RBH')
} else {
source_cols = c("MCScanX", "FastOMA", 'RBH')
}
# https://krassowski.github.io/complex-upset/articles/Examples_R.html
upset_plot = upset(
kept,
intersect = source_cols,
name = "Source",
width_ratio = 0.1,
base_annotations = list(
'Intersection size' = intersection_size(counts = FALSE) #,
# 'Intersection ratio' = intersection_ratio()
),
# Sort intersections first by degree (number of sets in intersection) descending,
# then by intersection size (cardinality) descending within each degree
sort_intersections_by = c("degree", "cardinality"),
sort_intersections = "descending") +
ggtitle("Overlap of gene pairs supported by multiple methods (after filter)")
# Print or save the plot
print(upset_plot)ggsave(paste0("../reports/", group, "/", plantNameOut, '/', plantNameOut, "_upset_plot_kept_2025-09-15.pdf"),
plot = upset_plot, width = 24, height = 6, device = "pdf")
cat('#### #### after filter #### #### \n')## #### #### after filter #### ####
cat("\t# ath unigue genes:", length(unique(kept$from_geneID)), "\n",
"\t#", plantName, "unigue genes:", length(unique(kept$to_geneID)), "\n",
"\t# ath highest connection degree: ", range(kept$from_count)[2], "\n",
"\t#", plantName, "highest connection degree: ", range(kept$to_count)[2], "\n",
"\t# genes in ath with degree > 30: ", length(unique(kept$from_geneID[kept$from_count > 30])), "\n",
"\t# genes in", plantName, "with degree > 30: ", length(unique(kept$to_geneID[kept$to_count > 30])), "\n\n"
)## # ath unigue genes: 18799
## # psib unigue genes: 18194
## # ath highest connection degree: 43
## # psib highest connection degree: 102
## # genes in ath with degree > 30: 12
## # genes in psib with degree > 30: 24
filter_counts = as.data.table(table(dt$filter_criteria))
setnames(filter_counts, c("filter_criteria", "Count"))
# Define desired order
desired_order = c(
"MCScanX",
"compara",
"PLAZA",
"OrthoDB_FastOMA_RBH",
"OrthoDB_RBH",
"FastOMA_OrthoDB",
"FastOMA_RBH",
"OrthoDB_MapMan4",
"RBH_MapMan4",
"FastOMA_MapMan4",
"reject"
)
filter_counts = filter_counts[filter_criteria %in% desired_order]
filter_counts[, filter_criteria := factor(filter_criteria, levels = desired_order)]
setorder(filter_counts, filter_criteria)
print(filter_counts)## filter_criteria Count
## <fctr> <int>
## 1: MCScanX 16398
## 2: OrthoDB_FastOMA_RBH 3708
## 3: OrthoDB_RBH 819
## 4: FastOMA_OrthoDB 1042
## 5: FastOMA_RBH 2006
## 6: OrthoDB_MapMan4 1102
## 7: RBH_MapMan4 1947
## 8: FastOMA_MapMan4 5266
## 9: reject 25855
pss = ath.annot[which(!is.na(ath.annot$all_pathways)), ]
pss = pss[, grep("id$|all_pathways$|short_name$", colnames(pss))]
pss = pss[!duplicated(pss), ]
pss = merge(pss,
df[, .SD, .SDcols = grep("from_geneID|to_geneID|ath_BINCODE|ath_NAME|ath_DESCRIPTION|athName|athSynonims|MapMan4_Match|filter_criteria",
names(dt), value = TRUE)],
by.x = 'id', by.y = 'from_geneID', all.x = TRUE, all.y = FALSE)
pss = pss[grep('^AT', pss$id), ]
pss = pss[!duplicated(pss), ]
table(pss$filter_criteria)##
## FastOMA_MapMan4 FastOMA_OrthoDB FastOMA_RBH MCScanX
## 262 46 69 674
## OrthoDB_FastOMA_RBH OrthoDB_MapMan4 OrthoDB_RBH RBH_MapMan4
## 104 69 33 87
## reject
## 816
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Slovenian_Slovenia.utf8 LC_CTYPE=Slovenian_Slovenia.utf8
## [3] LC_MONETARY=Slovenian_Slovenia.utf8 LC_NUMERIC=C
## [5] LC_TIME=Slovenian_Slovenia.utf8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ComplexUpset_1.3.3 ggplot2_4.0.0 knitr_1.50 data.table_1.17.8
## [5] magrittr_2.0.4
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 crayon_1.5.3 dplyr_1.1.4
## [5] compiler_4.5.1 zip_2.3.3 Rcpp_1.1.0 tidyselect_1.2.1
## [9] stringr_1.5.2 jquerylib_0.1.4 scales_1.4.0 yaml_2.3.10
## [13] fastmap_1.2.0 R6_2.6.1 labeling_0.4.3 generics_0.1.4
## [17] patchwork_1.3.2 igraph_2.1.4 openxlsx_4.2.8 tibble_3.3.0
## [21] bslib_0.9.0 pillar_1.11.1 RColorBrewer_1.1-3 rlang_1.1.6
## [25] utf8_1.2.6 cachem_1.1.0 stringi_1.8.7 xfun_0.53
## [29] sass_0.4.10 S7_0.2.0 cli_3.6.5 withr_3.0.2
## [33] digest_0.6.37 grid_4.5.1 rstudioapi_0.17.1 lifecycle_1.0.4
## [37] vctrs_0.6.5 evaluate_1.0.5 glue_1.8.0 farver_2.1.2
## [41] colorspace_2.1-1 rmarkdown_2.29 tools_4.5.1 pkgconfig_2.0.3
## [45] htmltools_0.5.8.1